/*
Creates choice set data for use in all MATLAB estimation (general_entry_choice_data.txt)

Differences from the previous workflow:
- Need NDC numbers now to verify intensity
- Add in diagnosed but not taking for extensive margin


*/


// Step 1: create a user quarter panel that includes the NDC number


clear
set more off
use enrolid plankey genind ndcnum svcdate using ccaed1996

// filter data: this has changed
merge m:1 ndcnum using "compressed_redbook_w_indications", keepus(generic_id indic_id xr) keep(mat)
keep if indic_id == 5
drop _merge

// start year necessary for generating first use indicator (relevant for reduced form)
merge m:1 enrolid using "all_enrolid_info", keep(mat) keepus(start_year)
drop _merge



forvalues j=1997/2013 {
	append using ccaed`j', keep(enrolid plankey genind ndcnum svcdate)
	
	drop generic_id indic_id xr start_year
	
	merge m:1 ndcnum using "compressed_redbook_w_indications", keepus(generic_id indic_id xr) keep(mat)
	keep if indic_id == 5
	drop _merge
	
	// filter data (keep size down)
	merge m:1 enrolid using "all_enrolid_info", keep(mat) keepus(start_year)
	drop _merge
	
}


///////////////
// END SETUP //
///////////////

keep if indic_id == 5 // start by looking at cholesterol (but code general enough to handle several)


// generics: genind (type of situation and generic vs. brand)
destring genind, replace force
keep if genind >= 1 & genind <= 5

gen generic = (genind == 4) | (genind == 5)
gen generic_available = (genind == 3)


// tack on Redbook information to get ingredient name
// get things like: indic_id, generic name, extended release version (indication from Cortellis)
drop genind


// figure out when a user first starts using anything in the class
sort enrolid svcdate generic // go with brand if multiple of the same day
egen first_use = tag(enrolid)

// label new subscription (first and six months after the start of the sample)
replace first_use = 0 if first_use == 1 & svcdate < mdy(6,1,start_year) 


gen year = year(svcdate)
gen quarter = qofd(svcdate)
tab quarter

gen prescriptions = 1

// Crude: get drug with most prescriptions in a given year
// 1 entry per enrollee, year, and indication
// additional variables: first year using drugs in some area (allows us to see if drug used in entry year)
collapse (sum) prescriptions (max) first_use plankey (first) year, by(enrolid ndcnum generic quarter)


// patch for dealing with multiple prescriptions (still want to record that this quarter was the first
egen first_use_temp = max(first_use), by(enrolid quarter)
replace first_use = first_use_temp
drop first_use_temp

// problematic step: will drop first use if first use involves two drugs
gsort enrolid quarter - prescriptions ndcnum // drugid for multiple prescription cases (need to clean this up later)

egen t = tag(enrolid quarter)
keep if t == 1 // keep top prescription for the quarter
drop t


sort enrolid quarter

tab plankey, m

save user_quarter_choice_cholesterol_ndcnum, replace 









// Step 2: find plankey for all relevant cholesterol users; take a 20% sample
clear
gen enrolid = .
gen year = .
save cholesterol_relevant_user_plankeys, replace


set more off
forvalues year = 1996/2006 {

	cd "/disk/aging/mktscan/nongeo/data/100pct" 
	use enrolid plnkey* using ccaea`year', clear 
	
	display `year'

	gen year = `year'
	
	gen plankey = .
	
	forvalues i=1/12 {
	
		replace plankey = plnkey`i' if plankey == .
	}
	
	
	// merge with cholesterol list
	merge m:1 enrolid using cholesterol_all_relevant_users, keep(3)
	drop _merge
	
	keep enrolid year plankey
	
	
	append using cholesterol_relevant_user_plankeys
	save cholesterol_relevant_user_plankeys, replace
}

forvalues year = 2007/2009 {

	//capture use enrolid plnkey* using ccaea`year', clear 
	use enrolid PLNKEY* using ccaea`year', clear 
	
	display `year'

	gen year = `year'
	
	gen plankey = .
	
	forvalues i=1/12 {
	
		//capture replace plankey = plnkey`i' if plankey == .
		replace plankey = PLNKEY`i' if plankey == .
	}
	
	
	// merge with cholesterol list
	merge m:1 enrolid using cholesterol_all_relevant_users, keep(3)
	drop _merge
	
	keep enrolid year plankey
	
	
	append using cholesterol_relevant_user_plankeys
	save cholesterol_relevant_user_plankeys, replace
}

forvalues year = 2010/2013 {

	use enrolid plnkey* using ccaea`year', clear 
	//capture use enrolid PLNKEY* using ccaea`year', clear 
	
	display `year'

	gen year = `year'
	
	gen plankey = .
	
	forvalues i=1/12 {
	
		replace plankey = plnkey`i' if plankey == .
		//capture replace plankey = PLNKEY`i' if plankey == .
	}
	
	
	// merge with cholesterol list
	merge m:1 enrolid using cholesterol_all_relevant_users, keep(3)
	drop _merge
	
	keep enrolid year plankey
	

	append using cholesterol_relevant_user_plankeys
	save cholesterol_relevant_user_plankeys, replace
}


keep enrolid year plankey

compress
save cholesterol_relevant_user_plankeys, replace




clear
use cholesterol_relevant_user_plankeys // enrolid, year, plankey

drop if plankey == .

keep enrolid

duplicates drop

gen r = runiform()
keep if r <= 0.25

save relevant_user_sample, replace











// Step 3: Convert the NDC claims data into a choice dataset



// 3A) create plan-level information on copays facing each patient for each choice (from raw data)


clear
set more off
use enrolid plankey genind ndcnum svcdate copay coins pay awp qty daysupp using ccaed1996

// filter data
merge m:1 ndcnum using "compressed_redbook_w_indications", keepus(generic_id indic_id xr) keep(mat)
drop _merge
keep if indic_id == 5


forvalues j=1997/2013 {

	append using ccaed`j', keep(enrolid plankey genind ndcnum svcdate copay coins pay awp qty daysupp)
	
	// filter data
	drop generic_id indic_id xr
	merge m:1 ndcnum using "compressed_redbook_w_indications", keepus(generic_id indic_id xr) keep(mat)
	drop _merge
	
	keep if indic_id == 5
}


///////////////
// END SETUP //
///////////////

gen year = year(svcdate)

// generics: genind (type of situation and generic vs. brand)
destring genind, replace force
keep if genind >= 1 & genind <= 5

gen generic = (genind == 4) | (genind == 5)
gen generic_available = (genind == 3)


// tack on Redbook information to get ingredient name
// get things like: indic_id, generic name, extended release version (indication from Cortellis)
drop genind

// correction
replace generic = 0 if year == 1997 | year == 1998
replace generic = 0 if year == 2002 & generic_id == 6190



// merge on strength and gennme info
merge m:1 ndcnum using "redbook", keepus(prodnme gennme strngth manfnme) keep(mat)
drop _merge

// Merge on dosage class info: get class info
merge m:1 gennme strngth using "strngth_cholesterol_drugs"
keep if _merge == 3
drop _merge


// CREATE QUARTERLY STATISTICS ON PATIENTS ON EACH DRUG

// look at stuff at the ndcnum x month level
gen month = mofd(svcdate)
gen quarter = qofd(svcdate)

gen cost_sharing = copay + coins
gen entries = 1

// no numbered plans have Vytorin? Previous bug (append error)
keep if plankey != .




// NORMALIZED VERSION

// minor edit
replace daysupp = 30 if daysupp == 0

replace cost_sharing = cost_sharing / daysupp * 30
replace pay = pay / daysupp * 30
replace awp = awp / daysupp * 30



// collapse to find the median
collapse (p50) cost_sharing pay awp (sd) sd_cost_sharing = cost_sharing (first) manfnme (count) entries, by(plankey gennme class generic xr quarter)

save plankey_cholesterol_menus_median_norm, replace


keep plankey quarter class
duplicates drop

// for now, just have an outside option for class 2 (most popular), maybe need it for other classes as well
gen gennme = "Outside Option"

save plankey_quarter_temp, replace


// combine the two datasets to tack on outside option for each plankey by quarter by class
clear
use plankey_cholesterol_menus_median_norm

append using plankey_quarter_temp, gen(outside)

gen brand = 1-generic

foreach v in generic brand cost_sharing xr pay {

	replace `v' = 0 if outside == 1
	
}

sort plankey quarter class gennme

compress
save plankey_cholesterol_menus_median_norm_augmented, replace




// 3B) use claims data and merge with the menu for each plankey

clear
use user_quarter_choice_cholesterol_ndcnum // 42% of entries have plankeys


merge m:1 enrolid using "relevant_user_sample" // 25% sample to ease computational burden
keep if _merge == 3
drop _merge


// C) tack on the plankeys, even for those who are on the extensive margin

capture drop year
gen year = year(dofq(quarter))
drop plankey

merge m:1 enrolid year using "cholesterol_relevant_user_plankeys"
keep if _merge == 3
drop _merge


// 1) merge in strngth and gennme information for the current choice
merge m:1 ndcnum using "redbook", keepus(prodnme gennme strngth mstfmds) keep(mat)
drop _merge

compress

gen xr = strpos(mstfmds, "Extended Release") > 0
drop mstfmds


// merge on the level of cholesterol reduction: class
merge m:1 gennme strngth using strngth_cholesterol_drugs
keep if _merge == 3
drop _merge

// order by choice
sort enrolid quarter

// Create previous and current choices (ignores spaces)
foreach v in gennme generic xr class {
	gen prev_`v' = `v'[_n-1] if enrolid == enrolid[_n-1]
	rename `v' curr_`v'
}

gen class = curr_class // for merging purposes



/*
EXTENSIVE MARGIN
*/

append using cholesterol_first_diagnosis, gen(diagnosis_entry) // does not have a quarter attached
gen diagnosis_quarter = qofd(first_diagnosis_date)

append using enrolid_end_quarter_cholesterol_relevant, gen(end) // does not have a quarter attached
gen end_quarter = qofd(end_date)


// merge in plankey for these users as well
drop plankey
merge m:1 enrolid year using "cholesterol_relevant_user_plankeys"
keep if _merge == 3
drop _merge


merge m:1 enrolid using "relevant_user_sample" // 25% sample
keep if _merge == 3
drop _merge

/////////////////////
// PROCESSING DATA //
/////////////////////


// 1) filter out diagnosis info that is irrelevant: comes after prescriptions start
egen min_quarter = min(quarter), by(enrolid) // first quarter with prescription
gen min_prescription_date = dofq(min_quarter)
egen min_diagnosis_date = min(first_diagnosis_date), by(enrolid)
drop if diagnosis_entry == 1 & diagnosis_quarter >= min_quarter // keep diagnosis events only if they create a distinct quarter

// 2) filter out end date info that is irrelevant for end date
egen max_quarter = max(quarter), by(enrolid) // first quarter with prescription
drop if end == 1 & end_quarter <= max_quarter



// A) fill in entries for unmatched people (from first diagnosis quarter to end of 2012)
// B) also fill in entries for matched people based on when they are first diagnosed until
replace quarter = diagnosis_quarter if quarter == . & diagnosis_quarter != . // sets marker from which to fill out stuff
replace quarter = end_quarter if quarter == . & end_quarter != .

// KEY STEP: create blank entries with just enrolid quarter for missing
sort enrolid quarter
tsset enrolid quarter
tsfill 

gen extensive = (curr_gennme == "")
egen all_extensive = min(extensive), by(enrolid)
gen all_extensive_user = (all_extensive == 1)



// Fill in info for outside option
tab curr_gennme

// NOTE: previously filtered out missing quarters for people on treatment
// drop if curr_generic == . & quarter >= min_quarter & all_extensive_user == 0 

// fills in gaps and for all extensive people
replace curr_gennme = "Outside Option" if curr_gennme == ""
foreach v in curr_generic curr_xr { 
	replace `v' = 0 if `v' == .
}

tab curr_gennme



/*
New estimation issues:
- Gaps in the plankey availability for a given user
- Probably want to check that incumbent makes sense
- Also want to drop null entries if they didn't have a valid plankey that quarter (dropped those in the using already)
*/

// fill in any plankeys
egen max_plankey = max(plankey), by(enrolid year)
replace plankey = max_plankey if plankey == .

// fill in missing class information
sort enrolid quarter
replace class = 2 if class == . & all_extensive_user == 1
replace class = class[_n-1] if enrolid == enrolid[_n-1] & class == .



// still go with class = 2 for now => just have the extra extensive 
// this is optional, though

// FINAL FILTER
keep if class == 2
drop if plankey == .
// END FINAL FILTER


egen first_quarter = min(quarter), by(enrolid) // first quarter for both takers and diagnosed

// joinby on any options in the class offered by the plankey
// augmented plankey data has class = 2 for outside option
joinby plankey quarter class using plankey_cholesterol_menus_median_norm_augmented // has an outside option for each class
gen chosen = (curr_gennme == gennme) & (curr_generic == generic) & (curr_xr == xr)
tab chosen
sort enrolid quarter


tab curr_gennme


// Incumbent: defined for now based on previous class 2 choice
gen incumbent = (gennme == prev_gennme) & prev_gennme != "Outside Option" // the gennme associated with the choice is the same as previous period choice
tab incumbent chosen


// NEW: save quarterly choice data
save quarterly_class2_choice_dataset, replace
















// Step 3: Process the choice dataset to focus on large pools of treatment and control users

use quarterly_class2_choice_dataset, clear

merge m:1 enrolid using "user_treatment_indicators_cholesterol_long_control"
keep if _merge == 3
drop _merge

gen entry_quarter = qofd(entry_date)

keep if year >= 1996 & year <= 2009
keep if entry_num == 1 | entry_num == 9 | entry_num == 12 // Crestor entry event and generic Zocor
keep if gennme == "Atorvastatin Calcium" | gennme == "Simvastatin" | gennme == "Rosuvastatin Calcium"

// 12k people left
egen t = tag(enrolid)
tab t
drop t

summ cost_sharing, detail // enough variation?

// reformat into what we need
gen drugid = 1 if gennme == "Simvastatin" & generic == 0
replace drugid = 2 if gennme == "Atorvastatin Calcium"
replace drugid = 3 if gennme == "Rosuvastatin Calcium"
replace drugid = 4 if gennme == "Simvastatin" & generic == 1

tab entry_quarter // should have 2
drop if entry_num == 1 & drugid == 2 & control == 1 & quarter == entry_quarter 
drop if entry_num == 9 & drugid == 3 & control == 1 & quarter == entry_quarter // assume not in choice set for control group for clarity
drop if entry_num == 12 & drugid == 4 & control == 1 & quarter == entry_quarter 


egen personid = group(enrolid)
keep personid quarter drugid chosen cost_sharing incumbent control treat entry_num
order personid quarter drugid chosen cost_sharing incumbent control treat entry_num

sort personid quarter drugid

// sanity check
tab chosen if drugid == 3 & control == 1
tab chosen if drugid == 3 & treat == 1

keep if quarter >= 146 // still have the right incumbent info (start in June 1996)

replace quarter = quarter - 145 // re-index to start at 1

tab entry_num

// can also separate this by entry_num
save general_entry_choice_data, replace
export delimited using "general_entry_choice_data.txt", replace novar

 
